# Chapter 5

5.2.4

P1

(flights %>% 
  filter(
    arr_delay >= 120 # Had an arrival delay of two or more hours (2 h = 120 min; 200 was wrong)
  ))
## # A tibble: 2,832 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      848           1835       853     1001
##  2  2013     1     1     1815           1325       290     2120
##  3  2013     1     1     1842           1422       260     1958
##  4  2013     1     1     2006           1630       216     2230
##  5  2013     1     1     2115           1700       255     2330
##  6  2013     1     1     2205           1720       285       46
##  7  2013     1     1     2343           1724       379      314
##  8  2013     1     2     1244            900       224     1431
##  9  2013     1     2     1332            904       268     1616
## 10  2013     1     2     1412            838       334     1710
## # … with 2,822 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Flew to Houston (either of its two airports, IAH or HOU)
(flights %>% 
  filter(dest %in% c("IAH", "HOU")))
## # A tibble: 9,313 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      623            627        -4      933
##  4  2013     1     1      728            732        -4     1041
##  5  2013     1     1      739            739         0     1104
##  6  2013     1     1      908            908         0     1228
##  7  2013     1     1     1028           1026         2     1350
##  8  2013     1     1     1044           1045        -1     1352
##  9  2013     1     1     1114            900       134     1447
## 10  2013     1     1     1205           1200         5     1503
## # … with 9,303 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Operated by United (UA), American (AA), or Delta (DL)
(flights %>% 
  filter(carrier %in% c("UA", "AA", "DL")))
## # A tibble: 139,504 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      554            600        -6      812
##  5  2013     1     1      554            558        -4      740
##  6  2013     1     1      558            600        -2      753
##  7  2013     1     1      558            600        -2      924
##  8  2013     1     1      558            600        -2      923
##  9  2013     1     1      559            600        -1      941
## 10  2013     1     1      559            600        -1      854
## # … with 139,494 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Departed in summer (July through September)
(flights %>% 
  filter(month %in% 7:9))
## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     7     1        1           2029       212      236
##  2  2013     7     1        2           2359         3      344
##  3  2013     7     1       29           2245       104      151
##  4  2013     7     1       43           2130       193      322
##  5  2013     7     1       44           2150       174      300
##  6  2013     7     1       46           2051       235      304
##  7  2013     7     1       48           2001       287      308
##  8  2013     7     1       58           2155       183      335
##  9  2013     7     1      100           2146       194      327
## 10  2013     7     1      100           2245       135      337
## # … with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
(flights %>% 
  filter(
    # Arrived more than two hours late (120 min, not 200), but didn't leave late;
    # with the correct threshold this set is non-empty.
    arr_delay >= 120, dep_delay <= 0
  ))
## # A tibble: 0 x 19
## # … with 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
(flights %>% 
  filter(
    # Were delayed by at least an hour (60 min, not 100), but made up over
    # 30 minutes in flight
    dep_delay >= 60, dep_delay - arr_delay > 30
  ))
## # A tibble: 1,056 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1     2205           1720       285       46
##  2  2013     1     1     2326           2130       116      131
##  3  2013     1     3     1503           1221       162     1803
##  4  2013     1     3     1941           1759       102     2246
##  5  2013     1     3     2257           2000       177       45
##  6  2013     1     4     1917           1700       137     2135
##  7  2013     1     4     2010           1745       145     2257
##  8  2013     1     4     2058           1730       208        2
##  9  2013     1     4     2100           1920       100     2224
## 10  2013     1     4     2221           1858       203      116
## # … with 1,046 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Departed between midnight and 6am (inclusive). Midnight is encoded as 2400
# in this dataset, so it needs explicit handling.
(flights %>% 
  filter(dep_time <= 600 | dep_time == 2400))
## # A tibble: 9,373 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
## # … with 9,363 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

P3

col_name <- colnames(flights)

# Count missing values per column. vapply() over the columns replaces the
# grow-by-c() loop (which copies the vector on every append) and the unsafe
# 1:length() index, and keeps the result a typed integer vector.
na_count <- vapply(flights, function(col) sum(is.na(col)), integer(1))

(result <- tibble(col_name, na_count) %>% filter(na_count > 0))
## # A tibble: 6 x 2
##   col_name  na_count
##   <chr>        <int>
## 1 dep_time      8255
## 2 dep_delay     8255
## 3 arr_time      8713
## 4 arr_delay     9430
## 5 tailnum       2512
## 6 air_time      9430

8,255 flights have a missing dep_time. A missing dep_time means that the flight never departed — i.e. it was cancelled. The columns dep_delay, arr_time, arr_delay, tailnum, and air_time also contain missing values, as shown above.

5.3.1

P1

arrange sorts boolean values in a way that TRUE values come after FALSE values. Thus, giving argument !is.na(COL_NAME) to arrange sorts data in a way that missing values in the column come first.

# TRUE sorts after FALSE, so desc(is.na(...)) places the rows with a
# missing dep_time first.
flights %>% 
  arrange(desc(is.na(dep_time)))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1       NA           1630        NA       NA
##  2  2013     1     1       NA           1935        NA       NA
##  3  2013     1     1       NA           1500        NA       NA
##  4  2013     1     1       NA            600        NA       NA
##  5  2013     1     2       NA           1540        NA       NA
##  6  2013     1     2       NA           1620        NA       NA
##  7  2013     1     2       NA           1355        NA       NA
##  8  2013     1     2       NA           1420        NA       NA
##  9  2013     1     2       NA           1321        NA       NA
## 10  2013     1     2       NA           1545        NA       NA
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

5.4.1

P4

select(flights, contains("TIME"))
## # A tibble: 336,776 x 6
##    dep_time sched_dep_time arr_time sched_arr_time air_time
##       <int>          <int>    <int>          <int>    <dbl>
##  1      517            515      830            819      227
##  2      533            529      850            830      227
##  3      542            540      923            850      160
##  4      544            545     1004           1022      183
##  5      554            600      812            837      116
##  6      554            558      740            728      150
##  7      555            600      913            854      158
##  8      557            600      709            723       53
##  9      557            600      838            846      140
## 10      558            600      753            745      138
## # … with 336,766 more rows, and 1 more variable: time_hour <dttm>
select(flights, contains("TIME", ignore.case=FALSE))
## # A tibble: 336,776 x 0

The dplyr::contains() helper defaults to ignore.case = TRUE, which makes the match case-insensitive. Specifying ignore.case = FALSE makes the selection case-sensitive, which is why "TIME" then matches none of the lowercase column names.

5.5.2

P1

time_to_minutes <- function(time) {
  # Convert an HHMM-encoded clock time (e.g. 517 means 5:17) to minutes
  # since midnight. Vectorized; works on whole columns.
  60 * (time %/% 100) + time %% 100
}

flights %>% 
  mutate(
    dep_time_minutes = time_to_minutes(dep_time),
    sched_dep_time_minutes = time_to_minutes(sched_dep_time)
  ) %>% 
  select(dep_time, dep_time_minutes, sched_dep_time, sched_dep_time_minutes)
## # A tibble: 336,776 x 4
##    dep_time dep_time_minutes sched_dep_time sched_dep_time_minutes
##       <int>            <dbl>          <int>                  <dbl>
##  1      517              317            515                    315
##  2      533              333            529                    329
##  3      542              342            540                    340
##  4      544              344            545                    345
##  5      554              354            600                    360
##  6      554              354            558                    358
##  7      555              355            600                    360
##  8      557              357            600                    360
##  9      557              357            600                    360
## 10      558              358            600                    360
## # … with 336,766 more rows

P3

flights %>% 
  transmute(
    dep_sched_dep = time_to_minutes(dep_time) - time_to_minutes(sched_dep_time),
    dep_delay,
    check = dep_sched_dep == dep_delay
  )
## # A tibble: 336,776 x 3
##    dep_sched_dep dep_delay check
##            <dbl>     <dbl> <lgl>
##  1             2         2 TRUE 
##  2             4         4 TRUE 
##  3             2         2 TRUE 
##  4            -1        -1 TRUE 
##  5            -6        -6 TRUE 
##  6            -4        -4 TRUE 
##  7            -5        -5 TRUE 
##  8            -3        -3 TRUE 
##  9            -3        -3 TRUE 
## 10            -2        -2 TRUE 
## # … with 336,766 more rows

This confirms that dep_time - sched_dep_time equals dep_delay for these rows. Note, however, that the equality breaks for flights whose actual departure slips past midnight: dep_time wraps around to the next day while dep_delay keeps counting, so check is FALSE for those rows.

5.6.7

5

flights %>% 
  group_by(carrier) %>% 
  summarize(
    worst_dep_delay = max(dep_delay, na.rm = TRUE),
    worst_arr_delay = max(arr_delay, na.rm = TRUE)
  ) %>% 
  arrange(desc(worst_dep_delay), desc(worst_arr_delay))
## # A tibble: 16 x 3
##    carrier worst_dep_delay worst_arr_delay
##    <chr>             <dbl>           <dbl>
##  1 HA                 1301            1272
##  2 MQ                 1137            1127
##  3 AA                 1014            1007
##  4 DL                  960             931
##  5 F9                  853             834
##  6 9E                  747             744
##  7 VX                  653             676
##  8 FL                  602             572
##  9 EV                  548             577
## 10 B6                  502             497
## 11 US                  500             492
## 12 UA                  483             455
## 13 WN                  471             453
## 14 YV                  387             381
## 15 AS                  225             198
## 16 OO                  154             157

Carrier HA has both the worst dep_delayed and the worst arr_delayed flights ever.

flights %>% 
  group_by(carrier) %>% 
  summarize(
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),
    mean_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>% 
  arrange(desc(mean_dep_delay), desc(mean_arr_delay))
## # A tibble: 16 x 3
##    carrier mean_dep_delay mean_arr_delay
##    <chr>            <dbl>          <dbl>
##  1 F9               20.2          21.9  
##  2 EV               20.0          15.8  
##  3 YV               19.0          15.6  
##  4 FL               18.7          20.1  
##  5 WN               17.7           9.65 
##  6 9E               16.7           7.38 
##  7 B6               13.0           9.46 
##  8 VX               12.9           1.76 
##  9 OO               12.6          11.9  
## 10 UA               12.1           3.56 
## 11 MQ               10.6          10.8  
## 12 DL                9.26          1.64 
## 13 AA                8.59          0.364
## 14 AS                5.80         -9.93 
## 15 HA                4.90         -6.92 
## 16 US                3.78          2.13

On average, carrier F9 is the worst in both departure and arrival delay.

We can disentangle the effects of bad airports by normalizing the delays of flights by the mean delays of each airport.

# Normalize each flight's departure delay by its origin airport's mean and
# standard deviation, so the carrier comparison is not confounded by airports.
flights %>% 
  filter(!is.na(dep_delay)) %>% 
  group_by(origin) %>%
  mutate(
    airport_delay_mean = mean(dep_delay),
    airport_delay_sd = sd(dep_delay),  # sd() is the idiomatic form of var()^(1/2)
    normal_dep_delay = (dep_delay - airport_delay_mean) / airport_delay_sd
  ) %>% 
  ungroup() %>%  # drop the origin grouping before regrouping by carrier
  group_by(carrier) %>% 
  summarize(
    delay = mean(normal_dep_delay)
  ) %>% 
  arrange(desc(delay))
## # A tibble: 16 x 2
##    carrier    delay
##    <chr>      <dbl>
##  1 F9       0.247  
##  2 YV       0.216  
##  3 FL       0.210  
##  4 EV       0.139  
##  5 WN       0.123  
##  6 9E       0.120  
##  7 OO       0.0304 
##  8 B6       0.0191 
##  9 VX      -0.00251
## 10 MQ      -0.0169 
## 11 UA      -0.0517 
## 12 DL      -0.0576 
## 13 AA      -0.0754 
## 14 HA      -0.185  
## 15 US      -0.195  
## 16 AS      -0.225

5.7.1

P5

flights_lagged <- flights %>%
  group_by(origin) %>% 
  arrange(year, month, day, dep_time) %>% 
  mutate(lagging_delay = lag(dep_delay))

# Mean departure delay for each value of the preceding flight's delay.
# na.rm = TRUE is needed: dep_delay is NA for cancelled flights, which would
# otherwise make the mean NA for every group containing one.
lagged_delay <- flights_lagged %>% 
  group_by(lagging_delay) %>% 
  summarize(mean_lagged_delay = mean(dep_delay, na.rm = TRUE))

ggplot(lagged_delay, aes(lagging_delay, mean_lagged_delay)) +
  geom_point(na.rm = TRUE)

Delay of preceding flight is almost linearly related with the delay of following flight, for up to 250 minutes of preceding delay. From 250 to 500 minutes of preceding delay, the plot implies the positive relationship, but with much bigger variance. From 500 minutes of preceding delay, the following delay seems to be nearly 0.

Chapter 7

7.3.4

P2

# Histograms of diamond price (under $10,000) at a range of binwidths.
binwidths <- c(5, 10, 30, 50, 100, 200, 500, 1000)  # <- not = for assignment
gg_diamonds <- diamonds %>% 
  filter(price < 10000) %>% 
  ggplot(aes(x = price))

# Iterate over the values directly instead of indexing with 1:length().
for (binwidth in binwidths) {
  plot <- gg_diamonds + 
    geom_histogram(binwidth = binwidth) +
    ggtitle(paste('Binwidth: ', binwidth))
  
  print(plot)
}

diamonds %>% 
  group_by(cut_width(price, 50)) %>% 
  summarize(n=n())
## # A tibble: 369 x 2
##    `cut_width(price, 50)`     n
##    <fct>                  <int>
##  1 [325,375]                105
##  2 (375,425]                407
##  3 (425,475]                768
##  4 (475,525]                935
##  5 (525,575]               1275
##  6 (575,625]               1373
##  7 (625,675]               1381
##  8 (675,725]               1503
##  9 (725,775]               1389
## 10 (775,825]               1295
## # … with 359 more rows

There is no diamond with the price between (1475, 1525], which is unusual. Also, there’s a slight increase in the counts of diamonds around from $3,750 to $5,000.

7.5.1.1

P1

flights %>% 
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min/60
  ) %>% 
  ggplot(aes(sched_dep_time, ..density..)) +
  geom_freqpoly(aes(color=cancelled), binwidth=1)

The problem is that there are far more non-cancelled flights than cancelled ones, which makes a comparison of raw counts difficult. Comparing densities (standardized counts) instead makes the comparison meaningful. The graph shows that flights tend to be cancelled more often in the afternoon (specifically, after 3pm).

P2

# Scatter each diamonds column against price.
vars <- colnames(diamonds)

# Use the .data pronoun instead of aes(x = diamonds[[...]]): indexing the
# data frame from outside bypasses ggplot's data masking and silently breaks
# if the plot data is filtered or reordered.
for (var in vars) {
  plot <- diamonds %>% 
    ggplot(aes(x = .data[[var]], y = price)) +
    geom_point(alpha = 1/10) +
    xlab(var)
    
  print(plot)
}

Seemingly, “x” and “carat” is most important in predicting the price, as they seem to have exponential relationship with the price of the diamond. “y” and “z” also seem to have exponential relationship, but they have too short range. Note that “x” and “carat” both indicates the size of diamonds. We can easily know that “x” and “carat” of a diamond have quite high correlation. Then we can expect that the relationship between “x” and “cut” would be similar to the relationship between “carat” and “cut”.

diamonds %>% 
  ggplot(aes(x, carat)) +
  geom_point()

diamonds %>% 
  ggplot(aes(cut, carat)) +
  geom_boxplot() +
  coord_cartesian(ylim=c(0, 2))

Now the graph shows that the more ideal the cut, the smaller the diamonds are, which makes ideal-cut diamonds cheaper than fair-cut ones.

7.5.2.1

2

flights %>% 
  group_by(dest, month) %>% 
  # summarize() (one row per dest/month cell) instead of mutate(), and fill
  # the tiles with the computed avg_delay — the original mapped fill to the
  # raw dep_delay column, ignoring the average it had just calculated.
  summarize(avg_delay = mean(dep_delay, na.rm = TRUE)) %>% 
  ggplot(aes(dest, month)) +
  geom_tile(aes(fill = avg_delay))

There are too many destinations to fit in one graph.

library(d3heatmap)

flights %>% 
  group_by(month, dest) %>% 
  filter(n() >= 50) %>% 
  summarize(avg_delay=mean(dep_delay, na.rm = TRUE)) %>% 
  spread(dest, avg_delay) %>% 
  d3heatmap(., Rowv = FALSE, Colv = FALSE)

We can use d3heatmap library instead. It offers interactive graph. Also, we can filter some interesting destinations only.

7.5.3.1

5

ggplot(data = diamonds) +
  geom_bin2d(mapping = aes(x = x, y = y)) +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))

ggplot(data = diamonds) +
  geom_point(mapping = aes(x = x, y = y)) +
  coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))

As a binned plot summarizes various data points in the bin, it is difficult to tell that a specific data point is unusual.

Chapter 10

10.5

P4

(annoying <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
))
## # A tibble: 10 x 2
##      `1`   `2`
##    <int> <dbl>
##  1     1  1.95
##  2     2  3.86
##  3     3  5.93
##  4     4  7.00
##  5     5  9.33
##  6     6 10.7 
##  7     7 14.9 
##  8     8 17.5 
##  9     9 18.0 
## 10    10 19.0
# 1.
annoying[, "1"]
## # A tibble: 10 x 1
##      `1`
##    <int>
##  1     1
##  2     2
##  3     3
##  4     4
##  5     5
##  6     6
##  7     7
##  8     8
##  9     9
## 10    10
# 2.
annoying %>% 
  ggplot(aes(`1`, `2`)) +
  geom_point()

# 3.
(annoying <- annoying %>% 
  mutate("3" = `2`/`1`))
## # A tibble: 10 x 3
##      `1`   `2`   `3`
##    <int> <dbl> <dbl>
##  1     1  1.95  1.95
##  2     2  3.86  1.93
##  3     3  5.93  1.98
##  4     4  7.00  1.75
##  5     5  9.33  1.87
##  6     6 10.7   1.78
##  7     7 14.9   2.13
##  8     8 17.5   2.18
##  9     9 18.0   2.00
## 10    10 19.0   1.90
# 4.
annoying %>% 
  rename("one" = "1", `two` = `2`, three = 3)
## # A tibble: 10 x 3
##      one   two three
##    <int> <dbl> <dbl>
##  1     1  1.95  1.95
##  2     2  3.86  1.93
##  3     3  5.93  1.98
##  4     4  7.00  1.75
##  5     5  9.33  1.87
##  6     6 10.7   1.78
##  7     7 14.9   2.13
##  8     8 17.5   2.18
##  9     9 18.0   2.00
## 10    10 19.0   1.90

Chapter 11

11.2.2

P5

# The 3rd value of each row is dropped (with a parsing warning), because the
# header defines only 2 columns.
read_csv("a,b\n1,2,3\n4,5,6")
## Warning: 2 parsing failures.
## row col  expected    actual         file
##   1  -- 2 columns 3 columns literal data
##   2  -- 2 columns 3 columns literal data
## # A tibble: 2 x 2
##       a     b
##   <dbl> <dbl>
## 1     1     2
## 2     4     5
# 4th columns are truncated, and 3rd column of the first row is NA, because 3 columns are assigned.
read_csv("a,b,c\n1,2\n1,2,3,4")
## Warning: 2 parsing failures.
## row col  expected    actual         file
##   1  -- 3 columns 2 columns literal data
##   2  -- 3 columns 4 columns literal data
## # A tibble: 2 x 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2    NA
## 2     1     2     3
# quote is not closed normally.
read_csv("a,b\n\"1")
## Warning: 2 parsing failures.
## row col                     expected    actual         file
##   1  a  closing quote at end of file           literal data
##   1  -- 2 columns                    1 columns literal data
## # A tibble: 1 x 2
##       a b    
##   <dbl> <chr>
## 1     1 <NA>
# 1, 2 of first row are saved as character, not integer, because a column should have one data type.
read_csv("a,b\n1,2\na,b")
## # A tibble: 2 x 2
##   a     b    
##   <chr> <chr>
## 1 1     2    
## 2 a     b
# read_csv() only splits on commas, so the whole line becomes one column.
# We need to specify the delimiter with read_delim() (or use read_csv2())
# to split the values.
read_csv("a;b\n1;3")
## # A tibble: 1 x 1
##   `a;b`
##   <chr>
## 1 1;3

11.3.5

P4

# an xlsx file from django model export
orders = readxl::read_excel("./etc/orders.xlsx")

# Locale with date format and timezone setting
django_locale = locale(date_format = "%Y-%m-%d %H:%M:%S", tz="Asia/Seoul")
parse_datetime(orders$created, locale = django_locale)
##  [1] "2019-10-10 07:42:09 KST" "2019-10-10 04:35:56 KST"
##  [3] "2019-10-10 04:00:58 KST" "2019-10-10 02:19:50 KST"
##  [5] "2019-10-10 02:03:38 KST" "2019-10-10 00:08:25 KST"
##  [7] "2019-10-08 07:27:06 KST" "2019-10-08 06:09:34 KST"
##  [9] "2019-10-08 06:01:30 KST" "2019-10-08 04:16:22 KST"
## [11] "2019-10-08 04:11:52 KST" "2019-10-08 02:28:28 KST"
## [13] "2019-10-08 01:20:54 KST" "2019-10-07 07:51:22 KST"
## [15] "2019-10-07 01:15:57 KST" "2019-10-05 01:31:51 KST"
## [17] "2019-10-04 01:47:21 KST" "2019-10-04 00:52:27 KST"
## [19] "2019-10-03 04:39:21 KST" "2019-10-03 00:07:31 KST"
## [21] "2019-10-02 07:28:18 KST" "2019-10-02 07:04:23 KST"
## [23] "2019-10-02 03:32:26 KST" "2019-10-02 00:01:14 KST"
## [25] "2019-10-01 02:24:38 KST" "2019-10-01 02:03:48 KST"
## [27] "2019-10-01 01:57:54 KST" "2019-10-01 01:40:08 KST"
## [29] "2019-10-01 00:20:51 KST" "2019-09-30 07:18:22 KST"
## [31] "2019-09-30 04:53:14 KST" "2019-09-30 04:24:18 KST"
## [33] "2019-09-30 02:28:05 KST" "2019-09-30 02:13:26 KST"
## [35] "2019-09-30 00:31:26 KST" "2019-09-27 06:42:43 KST"
## [37] "2019-09-27 05:55:33 KST" "2019-09-27 03:51:09 KST"
## [39] "2019-09-27 03:36:24 KST" "2019-09-26 23:59:54 KST"
## [41] "2019-09-26 08:42:26 KST" "2019-09-26 05:08:54 KST"
## [43] "2019-09-26 04:11:57 KST" "2019-09-26 02:39:22 KST"
## [45] "2019-09-26 00:24:39 KST" "2019-09-25 06:45:46 KST"
## [47] "2019-09-25 02:12:37 KST" "2019-09-25 00:52:57 KST"
## [49] "2019-09-24 01:42:33 KST" "2019-09-23 06:39:42 KST"
## [51] "2019-09-23 06:05:55 KST"

P7

d1 <- "January 1, 2010"
d2 <- "2015-Mar-07"
d3 <- "06-Jun-2017"
d4 <- c("August 19 (2015)", "July 1 (2015)")
d5 <- "12/30/14" # Dec 30, 2014
t1 <- "1705"
t2 <- "11:15:10.12 PM"
parse_date(d1, "%B %d, %Y")
## [1] "2010-01-01"
parse_date(d2, "%Y-%b-%d")
## [1] "2015-03-07"
parse_date(d3, "%d-%b-%Y")
## [1] "2017-06-06"
parse_date(d4, "%B %d (%Y)")
## [1] "2015-08-19" "2015-07-01"
parse_date(d5, "%m/%d/%y")
## [1] "2014-12-30"
parse_time(t1, "%H%M")
## 17:05:00
parse_time(t2, "%H:%M:%OS %p")
## 23:15:10.12

Extra Questions

1

library(quantmod)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
options("getSymbols.warning4.0" = FALSE)
samsung_xts <- getSymbols("005930.KS", auto.assign = FALSE)
## Warning: 005930.KS contains missing values. Some functions will not work if
## objects contain missing values in the middle of the series. Consider using
## na.omit(), na.approx(), na.fill(), etc to remove or replace them.

a.

library(timetk)
samsung_tbl <- timetk::tk_tbl(samsung_xts, rename_index = "date")
head(samsung_tbl)
## # A tibble: 6 x 7
##   date       `005930.KS.Open` `005930.KS.High` `005930.KS.Low`
##   <date>                <dbl>            <dbl>           <dbl>
## 1 2007-01-02            12400            12540           12320
## 2 2007-01-03            12540            12560           12220
## 3 2007-01-04            12220            12240           12060
## 4 2007-01-05            12160            12180           11900
## 5 2007-01-08            11840            11880           11580
## 6 2007-01-09            11740            11880           11640
## # … with 3 more variables: `005930.KS.Close` <dbl>,
## #   `005930.KS.Volume` <dbl>, `005930.KS.Adjusted` <dbl>

We can convert xts object to tibble easily using library timetk.

ggplot(samsung_tbl, aes(date, `005930.KS.Close`)) +
  geom_line()

b.

quantmod::getFX("USD/KRW")
## [1] "USD/KRW"
quantmod::getFX("JPY/KRW")
## [1] "JPY/KRW"
jpykrw <- tk_tbl(JPYKRW, rename_index = "date")
usdkrw <- tk_tbl(USDKRW, rename_index = "date")

samsung_adjusted <- samsung_tbl %>% 
  inner_join(jpykrw) %>% 
  inner_join(usdkrw) %>% 
  filter(!is.na(`005930.KS.Close`)) %>% 
  mutate(
    in_krw = `005930.KS.Close`,
    in_jpy = in_krw/JPY.KRW,
    in_usd = in_krw/USD.KRW
  )
## Joining, by = "date"
## Joining, by = "date"
initial_krw <- samsung_adjusted$in_krw[1]
initial_jpy <- samsung_adjusted$in_jpy[1]
initial_usd <- samsung_adjusted$in_usd[1]

samsung_adjusted %>% 
  mutate(
    in_jpy_adj = in_jpy * (initial_krw / initial_jpy),
    in_usd_adj = in_usd * (initial_krw / initial_usd)
  ) %>% 
  ggplot(aes(date)) +
  # Map the currency name itself to the colour aesthetic. The original mapped
  # literal colour names and then relabelled with scale_color_discrete(labels
  # = ...); labels are applied in sorted level order ("black", "blue", "red"),
  # so the USD line was labelled "JPY" and the JPY line "USD".
  geom_line(aes(y = in_krw, color = "KRW")) +
  geom_line(aes(y = in_jpy_adj, color = "JPY")) +
  geom_line(aes(y = in_usd_adj, color = "USD")) +
  scale_color_discrete(name = "Currency")

Using inner_join, we can extract only the common date points between the stock price data and the exchange rate data.

2

a.

(census <- read_csv("./etc/census.csv"))
## Warning: Duplicated column names deduplicated: '2018' => '2018_1' [4],
## '2018' => '2018_2' [5], '2018' => '2018_3' [6], '2018' => '2018_4' [7],
## '2018' => '2018_5' [8], '2018' => '2018_6' [9], '2018' => '2018_7' [10]
## Parsed with column specification:
## cols(
##   `행정구역별(읍면동)` = col_character(),
##   연령별 = col_character(),
##   `2018` = col_character(),
##   `2018_1` = col_character(),
##   `2018_2` = col_character(),
##   `2018_3` = col_character(),
##   `2018_4` = col_character(),
##   `2018_5` = col_character(),
##   `2018_6` = col_character(),
##   `2018_7` = col_character()
## )
## # A tibble: 305 x 10
##    `행정구역별(읍면동)`… 연령별 `2018` `2018_1` `2018_2` `2018_3` `2018_4`
##    <chr>            <chr>  <chr>  <chr>    <chr>    <chr>    <chr>   
##  1 행정구역별(읍면동)… 연령별 총인구(명… 총인구_남자(… 총인구_여자(… 총인구_성비… 내국인(명)…
##  2 서울특별시       합계   9,673… 4,718,9… 4,954,9… 95.2     9,299,5…
##  3    종로구     합계   157,9… 77,147   80,820   95.5     145,911 
##  4    중구       합계   129,7… 62,975   66,822   94.2     117,809 
##  5    용산구     합계   226,9… 110,685  116,253  95.2     207,812 
##  6    성동구     합계   306,7… 150,623  156,173  96.4     295,489 
##  7    광진구     합계   362,3… 175,762  186,542  94.2     342,575 
##  8    동대문구   합계   358,1… 176,323  181,818  97.0     338,293 
##  9    중랑구     합계   391,6… 193,599  198,069  97.7     385,082 
## 10    성북구     합계   438,7… 211,204  227,530  92.8     424,211 
## # … with 295 more rows, and 3 more variables: `2018_5` <chr>,
## #   `2018_6` <chr>, `2018_7` <chr>

Original data

col_names = c(
  "행정구역별(읍면동)",
  "연령별",
  "총인구(명)",
  "총인구_남자(명)",
  "총인구_여자(명)",
  "총인구_성비",
  "내국인(명)",
  "내국인_남자(명)",
  "내국인_여자(명)",
  "내국인_성비"
)

seoul2018 <- read_csv(
  "./etc/census.csv",
  skip = 3,
  n_max = 25,
  col_names = col_names
  )
## Parsed with column specification:
## cols(
##   `행정구역별(읍면동)` = col_character(),
##   연령별 = col_character(),
##   `총인구(명)` = col_number(),
##   `총인구_남자(명)` = col_number(),
##   `총인구_여자(명)` = col_number(),
##   총인구_성비 = col_double(),
##   `내국인(명)` = col_number(),
##   `내국인_남자(명)` = col_number(),
##   `내국인_여자(명)` = col_number(),
##   내국인_성비 = col_double()
## )
seoul2018
## # A tibble: 25 x 10
##    `행정구역별(읍면동)`… 연령별 `총인구(명)` `총인구_남자(명)`… `총인구_여자(명)`…
##    <chr>            <chr>         <dbl>            <dbl>            <dbl>
##  1    종로구     합계         157967            77147            80820
##  2    중구       합계         129797            62975            66822
##  3    용산구     합계         226938           110685           116253
##  4    성동구     합계         306796           150623           156173
##  5    광진구     합계         362304           175762           186542
##  6    동대문구   합계         358141           176323           181818
##  7    중랑구     합계         391668           193599           198069
##  8    성북구     합계         438734           211204           227530
##  9    강북구     합계         309138           150028           159110
## 10    도봉구     합계         328243           159569           168674
## # … with 15 more rows, and 5 more variables: 총인구_성비 <dbl>,
## #   `내국인(명)` <dbl>, `내국인_남자(명)` <dbl>, `내국인_여자(명)` <dbl>,
## #   내국인_성비 <dbl>

I skipped the first 3 rows so that the peripheral header information is gone and the default column types are inferred appropriately, at the cost of having to set the column names manually. Also, with the n_max option I kept only 25 rows from the data source.

b.

seoul2018tidy <- seoul2018 %>%
  select(-2)

seoul2018tidy
## # A tibble: 25 x 9
##    `행정구역별(읍면동)`… `총인구(명)` `총인구_남자(명)`… `총인구_여자(명)`…
##    <chr>                   <dbl>            <dbl>            <dbl>
##  1    종로구           157967            77147            80820
##  2    중구             129797            62975            66822
##  3    용산구           226938           110685           116253
##  4    성동구           306796           150623           156173
##  5    광진구           362304           175762           186542
##  6    동대문구         358141           176323           181818
##  7    중랑구           391668           193599           198069
##  8    성북구           438734           211204           227530
##  9    강북구           309138           150028           159110
## 10    도봉구           328243           159569           168674
## # … with 15 more rows, and 5 more variables: 총인구_성비 <dbl>,
## #   `내국인(명)` <dbl>, `내국인_남자(명)` <dbl>, `내국인_여자(명)` <dbl>,
## #   내국인_성비 <dbl>

The second column, whose value is “합계” for every row, carries no information, so I removed it. Apart from that the data is tidy: each row represents one district, and each column holds the variable indicated by its name.

c.

This is already handled by the default col_types argument of read_csv. Parsing a numeric vector with grouped digits can be done as follows:

# Use `<-` for assignment (not `=`) and avoid naming the object `locale`,
# which would shadow the readr::locale() function.
num_locale <- locale(grouping_mark = ",")

# parse_number() strips the grouping marks defined by the locale.
parse_number("192,014,152", locale = num_locale)
## [1] 192014152

d.

# Population column ("총인구(명)") of the tidied Seoul data.
pops <- seoul2018tidy[[2]]

(pops_n <- length(pops))
## [1] 25
(pops_mean <- mean(pops))
## [1] 386957.4
# sd(x) == sqrt(var(x)); clearer than the manual var(pops)^(1/2).
(pops_std <- sd(pops))
## [1] 121146.4
# Draw a normal sample with the same n, mean, and sd as the real data.
random_normal_sample <- rnorm(pops_n, pops_mean, pops_std)

hist(pops)

hist(random_normal_sample)

The random normal sample looks more bell-shaped: its data points are concentrated around, and symmetric about, the center (mean). The histogram of pops also shows data points concentrated around the center, but with more points to the right of the mean. Reference: https://www.datamentor.io/r-programming/examples/random-number/

e.

pops_mean
## [1] 386957.4
# Two independent bootstrap samples of size 10, each reduced to its mean.
mean(sample(pops, 10, replace = TRUE))
## [1] 393771
mean(sample(pops, 10, replace = TRUE))
## [1] 410301.9

They are all different, because the sample mean is itself a random variable.

f.

pops_mean
## [1] 386957.4
# Average of many sample means, with an increasing number of replications.
replicate(100, mean(sample(pops, 10, replace = TRUE))) %>% mean()
## [1] 389497.5
replicate(1000, mean(sample(pops, 10, replace = TRUE))) %>% mean()
## [1] 387100
replicate(10000, mean(sample(pops, 10, replace = TRUE))) %>% mean()
## [1] 386985.9

The average of the sample means gets closer to the population mean as the number of replications grows. This phenomenon is called the law of large numbers, and it shows that the sample mean is a consistent estimator of the population mean.

g.

pops_mean
## [1] 386957.4
# One sample mean for each of three increasing sample sizes.
mean(sample(pops, 10, replace = TRUE))
## [1] 374167
mean(sample(pops, 25, replace = TRUE))
## [1] 395694.4
mean(sample(pops, 100, replace = TRUE))
## [1] 399757.6

The numbers get closer to the population mean.

h.

# Empirical sampling distribution of the mean for n = 10, 25, 100.
replicate(10000, mean(sample(pops, 10, replace = TRUE))) %>%
  hist()

replicate(10000, mean(sample(pops, 25, replace = TRUE))) %>%
  hist()

replicate(10000, mean(sample(pops, 100, replace = TRUE))) %>%
  hist()

The histograms look more like a normal distribution than hist(pops) does. This phenomenon is called the Central Limit Theorem, which states that the sample mean approximately follows the normal distribution N(mean, var/n).

i.

# Quantiles (every 2.5%) of the sampling distribution of the mean, n = 10.
replicate(1000, mean(sample(pops, 10, replace = TRUE))) %>%
  quantile(probs = seq(0, 1, by = 0.025))
##       0%     2.5%       5%     7.5%      10%    12.5%      15%    17.5% 
## 224261.4 319930.1 328155.8 335329.0 340621.4 345162.5 350103.1 354434.0 
##      20%    22.5%      25%    27.5%      30%    32.5%      35%    37.5% 
## 356698.6 360396.3 362711.7 365394.5 367577.4 370471.6 374101.8 377307.7 
##      40%    42.5%      45%    47.5%      50%    52.5%      55%    57.5% 
## 379237.7 381629.3 384364.0 386994.1 388890.7 390537.6 392605.6 394408.3 
##      60%    62.5%      65%    67.5%      70%    72.5%      75%    77.5% 
## 396037.2 398994.7 402590.4 405861.1 408681.2 411167.9 413522.9 416760.9 
##      80%    82.5%      85%    87.5%      90%    92.5%      95%    97.5% 
## 420091.7 422978.2 426715.3 430780.1 435252.8 441631.9 448660.1 460804.7 
##     100% 
## 503071.7

204,885 is less than the 2.5% quantile, which means a sample mean of 204,885 is extremely unlikely under Seoul’s population data. That is, the probability that a sample mean lies farther from the population mean than 204,885 does is less than 5%. So I don’t think the data comes from Seoul’s population.